import numpy as np
import pandas as pd
import string
import re

GET_WORDS_NUM = 10
PREPROCESS_PATH = './news_preprocessing.txt'

"""
Uncomment the code below to run the preprocessing step; otherwise, place the
preprocessed file as described in the report to reproduce the results.

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

DATA_PATH = './news.txt'
with open(DATA_PATH, encoding='utf-8') as f:
    corpus = f.readlines()


def preprocessing(text):
    # :param text: one raw line of the corpus.
    # :return: the preprocessed line.
    text = text.lower()
    puncs = string.punctuation + '‘“”’—'
    numbers = '1234567890'
    for i in puncs + numbers:
        text = text.replace(i, ' ')
    text = re.sub(r'\d+', '', text)
    wordList = nltk.word_tokenize(text)
    filtered = [w for w in wordList if w not in stopwords.words('english')]
    # Keep only nouns / specific POS tags (optional):
    # refiltered = nltk.pos_tag(filtered)
    # filtered = [w for w, pos in refiltered if pos.startswith('NN')]
    # Stemming (optional):
    # ps = PorterStemmer()
    # filtered = [ps.stem(w) for w in filtered]
    return " ".join(filtered)


from tqdm import tqdm
for i, _ in tqdm(enumerate(corpus)):
    corpus[i] = preprocessing(corpus[i])
# Preprocessing takes roughly one hour.

with open(PREPROCESS_PATH, 'w', encoding='utf-8') as f:
    for line in corpus:
        f.write('%s\n' % line)
"""

with open(PREPROCESS_PATH, encoding='utf-8') as f:
    corpus = f.readlines()
print('Finished reading the data!')

from sklearn.feature_extraction.text import CountVectorizer

# Keep terms that appear in at least 2 documents and in at most 95% of them,
# and retain only the 10000 most frequent features.
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')
word_mat = count_vectorizer.fit_transform(corpus)
vocabulary = count_vectorizer.vocabulary_
feature_names = count_vectorizer.get_feature_names()  # use get_feature_names_out() on scikit-learn >= 1.2

# Expand the document-term matrix into one list of word indices per document,
# with each index repeated by its count (the form the Gibbs sampler needs).
docs = []
for row in word_mat.toarray():
    present_words = np.where(row != 0)[0].tolist()
    present_words_with_count = []
    for word_idx in present_words:
        for _ in range(row[word_idx]):
            present_words_with_count.append(word_idx)
    docs.append(present_words_with_count)

for n_topic in [5, 10, 20]:
    D, V = len(docs), len(vocabulary)
    beta = 1 / n_topic   # symmetric Dirichlet prior on topic-word distributions
    alpha = 1 / n_topic  # symmetric Dirichlet prior on document-topic distributions
    z_d_n = [[0 for _ in range(len(d))] for d in docs]  # topic assignment of each word
    theta_d_z = np.zeros((D, n_topic))  # per-document topic counts
    phi_z_w = np.zeros((n_topic, V))    # per-topic word counts
    n_d = np.zeros(D)                   # words per document
    n_z = np.zeros(n_topic)             # words per topic

    # Initialize topic assignments round-robin and accumulate the counts.
    for d, doc in enumerate(docs):
        for n, w in enumerate(doc):
            z_d_n[d][n] = n % n_topic
            z = z_d_n[d][n]
            theta_d_z[d][z] += 1
            phi_z_w[z, w] += 1
            n_z[z] += 1
            n_d[d] += 1

    # Collapsed Gibbs sampling.
    for iteration in range(10):
        for d, doc in enumerate(docs):
            for n, w in enumerate(doc):
                # Remove the current word from the counts.
                z = z_d_n[d][n]
                theta_d_z[d][z] -= 1
                phi_z_w[z, w] -= 1
                n_z[z] -= 1

                # Full conditional over topics, then resample the assignment.
                p_d_t = (theta_d_z[d] + alpha) / (n_d[d] - 1 + n_topic * alpha)
                p_t_w = (phi_z_w[:, w] + beta) / (n_z + V * beta)
                p_z = p_d_t * p_t_w
                p_z /= np.sum(p_z)
                new_z = np.random.multinomial(1, p_z).argmax()

                # Add the word back under its new topic.
                z_d_n[d][n] = new_z
                theta_d_z[d][new_z] += 1
                phi_z_w[new_z, w] += 1
                n_z[new_z] += 1

    inv_vocabulary = {v: k for k, v in vocabulary.items()}
    # The code here is improved from https://www.depends-on-the-definition.com/lda-from-scratch/,
    # but all of it has been understood before reuse.
    print(f'With {n_topic} topics:')
    for topic_idx, topic in enumerate(phi_z_w):
        print(f' Topic #{topic_idx}: ', end='')
        print(', '.join([feature_names[i] for i in topic.argsort()[:-GET_WORDS_NUM - 1:-1]]))
    print()
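
# ---------------------------------------------------------------------------
# Optional sketch (an addition, not part of the original pipeline): the point
# estimates of the document-topic distribution theta and the topic-word
# distribution phi can be recovered by normalizing the final Gibbs counts with
# the same alpha / beta priors used above. As written it reuses the variables
# left over from the last n_topic value of the loop (n_topic = 20); move it
# inside the loop to get estimates for every topic count. The names theta_hat
# and phi_hat are illustrative, not from the original script.
theta_hat = (theta_d_z + alpha) / (theta_d_z.sum(axis=1, keepdims=True) + n_topic * alpha)
phi_hat = (phi_z_w + beta) / (phi_z_w.sum(axis=1, keepdims=True) + V * beta)
# Each row of theta_hat / phi_hat now sums to 1; for example, the most likely
# topic of the first document is:
print('Most likely topic of document 0:', theta_hat[0].argmax())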